# 1. Create the Spark environment
import time

from pyspark.context import SparkContext

sc = SparkContext(master='local', appName='demo2_map')

# 2. Read the data: textFile yields one element per line of the file.
students_rdd = sc.textFile("../../data/students.txt")

# Goal: compute the average age of every class.


def _parse_class_and_age(student):
    """Turn one comma-separated student line into a (class, age) pair.

    The line is split a single time. The class is taken from the last field
    and the age from the third field (index 2), converted to ``int``.

    Raises:
        IndexError: if the line has fewer than three fields.
        ValueError: if the age field is not an integer literal.
    """
    fields = student.split(",")
    return fields[-1], int(fields[2])


# Extract the class and the age from every record.
kv_rdd = students_rdd.map(_parse_class_and_age)

# Group by class: records sharing a key land in the same group, and the
# values of each key are exposed through a single iterable.
# Unlike a plain list, whose data lives in memory, the iterable's data may be
# kept on disk.
# groupByKey triggers a shuffle.
# NOTE: the variable name is a typo for "group_by_key"; it is kept unchanged
# because later code refers to it by this name.
group_by_bey_rdd = kv_rdd.groupByKey()


# Compute the average age within each group
def avg_age_fun(kv):
    """Compute the mean age for one grouped (class, ages) record.

    Args:
        kv: A pair whose first item is the class name and whose second item
            is a sized, iterable collection of numeric ages for that class.

    Returns:
        A ``(class_name, average_age)`` tuple, with the average rounded to
        two decimal places.

    Raises:
        ZeroDivisionError: if the ages collection is empty.
    """
    group_key, age_values = kv[0], kv[1]
    # Sum first, then count, so the collection is walked in the same order
    # as a direct sum(...) / len(...) expression would.
    total_age = sum(age_values)
    headcount = len(age_values)
    return group_key, round(total_age / headcount, 2)


# Map every (class, ages) group to a (class, average_age) pair.
avg_age_rdd = group_by_bey_rdd.map(avg_age_fun)

# foreach is an action: it runs the job and prints each resulting pair.
avg_age_rdd.foreach(print)

# Keep the process alive after the job has run (presumably so the Spark web
# UI stays reachable for inspection -- confirm with the author).
# Sleeping inside the loop blocks without pinning a CPU core at 100%, which
# a bare `while True: pass` spin would do.
while True:
    time.sleep(60)
